From the course website:
# Standard data-analysis imports.
import pandas as pd
import numpy as np
# Seeded generator so every draw is reproducible; the recorded cell outputs
# depend on the exact order of the rng calls below — do not reorder them.
rng = np.random.default_rng(10)
names = ["Ava", "Benjamin", "Charlotte", "Daniel", "Emma", "Fredric", "Gianna"]
courses = ["MTH 141", "MTH 142", "MTH 241", "MTH 306", "MTH 309", "MTH 311"]
rooms = ["NSC 216", "Capen 110", "Park 440"]
# concat rows data
scores1 = rng.integers(0, 100, 12).reshape(4, 3)  # 4 students x 3 problems
scores2 = rng.integers(0, 100, 9).reshape(3, 3)   # 3 students x 3 problems
columns = ["problem_1", "problem_2", "problem_3"]
sec1 = pd.DataFrame(scores1, index=names[:4], columns=columns)
sec2 = pd.DataFrame(scores2, index=names[4:7], columns=columns)
# concat columns data
scores1 = rng.integers(0, 100, 8).reshape(4, 2)   # rebinds scores1: 4 students x 2 problems
scores2 = rng.integers(0, 100, 9).reshape(3, 3)   # rebinds scores2: 3 students x 3 problems
part1 = pd.DataFrame(scores1,
index=names[:4],
columns=["problem_1", "problem_2"])
part2 = pd.DataFrame(scores2,
index=names[:3],
columns=["problem_3", "problem_4", "problem_5"])
# merging data
office_nums = rng.integers(100, 150, len(names[:-1]))
# NOTE: rebinds `courses` from the list of course codes to a DataFrame.
courses = pd.DataFrame({"course": courses,
"instructor": rng.choice(names[1:], len(courses))})
instructors = pd.DataFrame({"name": names[:-1], "office": office_nums}, dtype="object")
# Show both section DataFrames (display() is the notebook's rich renderer).
display(sec1)
display(sec2)
| problem_1 | problem_2 | problem_3 | |
|---|---|---|---|
| Ava | 77 | 95 | 26 |
| Benjamin | 20 | 79 | 82 |
| Charlotte | 51 | 14 | 83 |
| Daniel | 51 | 15 | 13 |
| problem_1 | problem_2 | problem_3 | |
|---|---|---|---|
| Emma | 41 | 68 | 40 |
| Fredric | 84 | 0 | 42 |
| Gianna | 52 | 95 | 23 |
# Stack the two sections vertically; existing row labels are kept as-is.
pd.concat([sec1, sec2])
| problem_1 | problem_2 | problem_3 | |
|---|---|---|---|
| Ava | 77 | 95 | 26 |
| Benjamin | 20 | 79 | 82 |
| Charlotte | 51 | 14 | 83 |
| Daniel | 51 | 15 | 13 |
| Emma | 41 | 68 | 40 |
| Fredric | 84 | 0 | 42 |
| Gianna | 52 | 95 | 23 |
# keys= adds an outer index level recording which frame each row came from.
df = pd.concat([sec1, sec2], keys=["section_1", "section_2"])
df
| problem_1 | problem_2 | problem_3 | ||
|---|---|---|---|---|
| section_1 | Ava | 77 | 95 | 26 |
| Benjamin | 20 | 79 | 82 | |
| Charlotte | 51 | 14 | 83 | |
| Daniel | 51 | 15 | 13 | |
| section_2 | Emma | 41 | 68 | 40 |
| Fredric | 84 | 0 | 42 | |
| Gianna | 52 | 95 | 23 |
# Select every row under one outer-level key of the MultiIndex.
df.loc["section_2"]
| problem_1 | problem_2 | problem_3 | |
|---|---|---|---|
| Emma | 41 | 68 | 40 |
| Fredric | 84 | 0 | 42 |
| Gianna | 52 | 95 | 23 |
# A full (outer, inner) tuple selects a single row, returned as a Series.
df.loc[("section_2", "Gianna")]
problem_1 52 problem_2 95 problem_3 23 Name: (section_2, Gianna), dtype: int64
# Rename a column (axis=1) in section 1; rename returns a new frame.
sec1 = sec1.rename({"problem_3": "problem_3a"}, axis=1)
sec1
| problem_1 | problem_2 | problem_3a | |
|---|---|---|---|
| Ava | 77 | 95 | 26 |
| Benjamin | 20 | 79 | 82 |
| Charlotte | 51 | 14 | 83 |
| Daniel | 51 | 15 | 13 |
# Same rename for section 2, so the two frames now disagree on one column.
sec2 = sec2.rename({"problem_3": "problem_3b"}, axis=1)
sec2
| problem_1 | problem_2 | problem_3b | |
|---|---|---|---|
| Emma | 41 | 68 | 40 |
| Fredric | 84 | 0 | 42 |
| Gianna | 52 | 95 | 23 |
# Default outer join on columns: non-shared columns are filled with NaN.
pd.concat([sec1, sec2])
| problem_1 | problem_2 | problem_3a | problem_3b | |
|---|---|---|---|---|
| Ava | 77 | 95 | 26.0 | NaN |
| Benjamin | 20 | 79 | 82.0 | NaN |
| Charlotte | 51 | 14 | 83.0 | NaN |
| Daniel | 51 | 15 | 13.0 | NaN |
| Emma | 41 | 68 | NaN | 40.0 |
| Fredric | 84 | 0 | NaN | 42.0 |
| Gianna | 52 | 95 | NaN | 23.0 |
# Inner join keeps only the columns common to both frames.
pd.concat([sec1, sec2], join="inner")
| problem_1 | problem_2 | |
|---|---|---|
| Ava | 77 | 95 |
| Benjamin | 20 | 79 |
| Charlotte | 51 | 14 |
| Daniel | 51 | 15 |
| Emma | 41 | 68 |
| Fredric | 84 | 0 |
| Gianna | 52 | 95 |
# Show the two column-wise pieces before concatenating them side by side.
display(part1)
display(part2)
| problem_1 | problem_2 | |
|---|---|---|
| Ava | 82 | 7 |
| Benjamin | 33 | 74 |
| Charlotte | 57 | 93 |
| Daniel | 75 | 91 |
| problem_3 | problem_4 | problem_5 | |
|---|---|---|---|
| Ava | 82 | 13 | 93 |
| Benjamin | 84 | 14 | 97 |
| Charlotte | 74 | 31 | 13 |
# axis=1 concatenates side by side, aligning rows on the index (NaN where missing).
pd.concat([part1, part2], axis=1)
| problem_1 | problem_2 | problem_3 | problem_4 | problem_5 | |
|---|---|---|---|---|---|
| Ava | 82 | 7 | 82.0 | 13.0 | 93.0 |
| Benjamin | 33 | 74 | 84.0 | 14.0 | 97.0 |
| Charlotte | 57 | 93 | 74.0 | 31.0 | 13.0 |
| Daniel | 75 | 91 | NaN | NaN | NaN |
# Show the two frames to be merged on instructor name.
display(courses)
display(instructors)
| course | instructor | |
|---|---|---|
| 0 | MTH 141 | Emma |
| 1 | MTH 142 | Charlotte |
| 2 | MTH 241 | Benjamin |
| 3 | MTH 306 | Gianna |
| 4 | MTH 309 | Charlotte |
| 5 | MTH 311 | Emma |
| name | office | |
|---|---|---|
| 0 | Ava | 119 |
| 1 | Benjamin | 145 |
| 2 | Charlotte | 139 |
| 3 | Daniel | 111 |
| 4 | Emma | 124 |
| 5 | Fredric | 142 |
# Inner merge: only rows whose instructor also appears in `instructors` survive.
schedule = pd.merge(courses, instructors, left_on="instructor", right_on="name", how="inner")
schedule
| course | instructor | name | office | |
|---|---|---|---|---|
| 0 | MTH 141 | Emma | Emma | 124 |
| 1 | MTH 311 | Emma | Emma | 124 |
| 2 | MTH 142 | Charlotte | Charlotte | 139 |
| 3 | MTH 309 | Charlotte | Charlotte | 139 |
| 4 | MTH 241 | Benjamin | Benjamin | 145 |
# Drop the redundant join key; NOTE: result is not assigned, `schedule` is unchanged.
schedule.drop("name", axis=1)
| course | instructor | office | |
|---|---|---|---|
| 0 | MTH 141 | Emma | 124 |
| 1 | MTH 311 | Emma | 124 |
| 2 | MTH 142 | Charlotte | 139 |
| 3 | MTH 309 | Charlotte | 139 |
| 4 | MTH 241 | Benjamin | 145 |
# Left merge: keep every course; unmatched instructors (Gianna) get NaN office.
pd.merge(courses, instructors, left_on="instructor", right_on="name", how="left")
| course | instructor | name | office | |
|---|---|---|---|---|
| 0 | MTH 141 | Emma | Emma | 124 |
| 1 | MTH 142 | Charlotte | Charlotte | 139 |
| 2 | MTH 241 | Benjamin | Benjamin | 145 |
| 3 | MTH 306 | Gianna | NaN | NaN |
| 4 | MTH 309 | Charlotte | Charlotte | 139 |
| 5 | MTH 311 | Emma | Emma | 124 |
# Right merge: keep every instructor; those teaching no course get NaN course.
pd.merge(courses, instructors, left_on="instructor", right_on="name", how="right")
| course | instructor | name | office | |
|---|---|---|---|---|
| 0 | NaN | NaN | Ava | 119 |
| 1 | MTH 241 | Benjamin | Benjamin | 145 |
| 2 | MTH 142 | Charlotte | Charlotte | 139 |
| 3 | MTH 309 | Charlotte | Charlotte | 139 |
| 4 | NaN | NaN | Daniel | 111 |
| 5 | MTH 141 | Emma | Emma | 124 |
| 6 | MTH 311 | Emma | Emma | 124 |
| 7 | NaN | NaN | Fredric | 142 |
# Outer merge: union of both sides, NaN wherever either side has no match.
pd.merge(courses, instructors, left_on="instructor", right_on="name", how="outer")
| course | instructor | name | office | |
|---|---|---|---|---|
| 0 | MTH 141 | Emma | Emma | 124 |
| 1 | MTH 311 | Emma | Emma | 124 |
| 2 | MTH 142 | Charlotte | Charlotte | 139 |
| 3 | MTH 309 | Charlotte | Charlotte | 139 |
| 4 | MTH 241 | Benjamin | Benjamin | 145 |
| 5 | MTH 306 | Gianna | NaN | NaN |
| 6 | NaN | NaN | Ava | 119 |
| 7 | NaN | NaN | Daniel | 111 |
| 8 | NaN | NaN | Fredric | 142 |
from zipfile import ZipFile
# Unpack the baby-names archive into baby_names/ (one yobYYYY.txt per year).
with ZipFile("names.zip", 'r') as z:
z.extractall(path="baby_names")
# The files have no header row, so supply column names explicitly.
df = pd.read_csv("baby_names/yob1880.txt", names=["name", "sex", "count"])
df
| name | sex | count | |
|---|---|---|---|
| 0 | Mary | F | 7065 |
| 1 | Anna | F | 2604 |
| 2 | Emma | F | 2003 |
| 3 | Elizabeth | F | 1939 |
| 4 | Minnie | F | 1746 |
| ... | ... | ... | ... |
| 1995 | Woodie | M | 5 |
| 1996 | Worthy | M | 5 |
| 1997 | Wright | M | 5 |
| 1998 | York | M | 5 |
| 1999 | Zachariah | M | 5 |
2000 rows × 3 columns
import glob
# Collect every per-year file; glob order is arbitrary, NOT sorted by year.
file_names = glob.glob("baby_names/yob*.txt")
file_names
['baby_names/yob2000.txt', 'baby_names/yob2014.txt', 'baby_names/yob1938.txt', 'baby_names/yob1910.txt', 'baby_names/yob1904.txt', 'baby_names/yob1905.txt', 'baby_names/yob1911.txt', 'baby_names/yob1939.txt', 'baby_names/yob2015.txt', 'baby_names/yob2001.txt', 'baby_names/yob2017.txt', 'baby_names/yob2003.txt', 'baby_names/yob1907.txt', 'baby_names/yob1913.txt', 'baby_names/yob1898.txt', 'baby_names/yob1899.txt', 'baby_names/yob1912.txt', 'baby_names/yob1906.txt', 'baby_names/yob2002.txt', 'baby_names/yob2016.txt', 'baby_names/yob2012.txt', 'baby_names/yob2006.txt', 'baby_names/yob1902.txt', 'baby_names/yob1916.txt', 'baby_names/yob1889.txt', 'baby_names/yob1888.txt', 'baby_names/yob1917.txt', 'baby_names/yob1903.txt', 'baby_names/yob2007.txt', 'baby_names/yob2013.txt', 'baby_names/yob2005.txt', 'baby_names/yob2011.txt', 'baby_names/yob1915.txt', 'baby_names/yob1901.txt', 'baby_names/yob1929.txt', 'baby_names/yob1928.txt', 'baby_names/yob1900.txt', 'baby_names/yob1914.txt', 'baby_names/yob2010.txt', 'baby_names/yob2004.txt', 'baby_names/yob1973.txt', 'baby_names/yob1967.txt', 'baby_names/yob1998.txt', 'baby_names/yob1999.txt', 'baby_names/yob1966.txt', 'baby_names/yob1972.txt', 'baby_names/yob1958.txt', 'baby_names/yob1964.txt', 'baby_names/yob1970.txt', 'baby_names/yob1971.txt', 'baby_names/yob1965.txt', 'baby_names/yob1959.txt', 'baby_names/yob1961.txt', 'baby_names/yob1975.txt', 'baby_names/yob1949.txt', 'baby_names/yob1948.txt', 'baby_names/yob1974.txt', 'baby_names/yob1960.txt', 'baby_names/yob1976.txt', 'baby_names/yob1962.txt', 'baby_names/yob1989.txt', 'baby_names/yob1988.txt', 'baby_names/yob1963.txt', 'baby_names/yob1977.txt', 'baby_names/yob1952.txt', 'baby_names/yob1946.txt', 'baby_names/yob1991.txt', 'baby_names/yob1985.txt', 'baby_names/yob1984.txt', 'baby_names/yob1990.txt', 'baby_names/yob1947.txt', 'baby_names/yob1953.txt', 'baby_names/yob1979.txt', 'baby_names/yob1945.txt', 'baby_names/yob1951.txt', 'baby_names/yob1986.txt', 
'baby_names/yob1992.txt', 'baby_names/yob1993.txt', 'baby_names/yob1987.txt', 'baby_names/yob1950.txt', 'baby_names/yob1944.txt', 'baby_names/yob1978.txt', 'baby_names/yob1940.txt', 'baby_names/yob1954.txt', 'baby_names/yob1968.txt', 'baby_names/yob1983.txt', 'baby_names/yob1997.txt', 'baby_names/yob1996.txt', 'baby_names/yob1982.txt', 'baby_names/yob1969.txt', 'baby_names/yob1955.txt', 'baby_names/yob1941.txt', 'baby_names/yob1957.txt', 'baby_names/yob1943.txt', 'baby_names/yob1994.txt', 'baby_names/yob1980.txt', 'baby_names/yob1981.txt', 'baby_names/yob1995.txt', 'baby_names/yob1942.txt', 'baby_names/yob1956.txt', 'baby_names/yob2009.txt', 'baby_names/yob1919.txt', 'baby_names/yob1931.txt', 'baby_names/yob1925.txt', 'baby_names/yob1886.txt', 'baby_names/yob1892.txt', 'baby_names/yob1893.txt', 'baby_names/yob1887.txt', 'baby_names/yob1924.txt', 'baby_names/yob1930.txt', 'baby_names/yob1918.txt', 'baby_names/yob2020.txt', 'baby_names/yob2008.txt', 'baby_names/yob1926.txt', 'baby_names/yob1932.txt', 'baby_names/yob1891.txt', 'baby_names/yob1885.txt', 'baby_names/yob1884.txt', 'baby_names/yob1890.txt', 'baby_names/yob1933.txt', 'baby_names/yob1927.txt', 'baby_names/yob1923.txt', 'baby_names/yob1937.txt', 'baby_names/yob1894.txt', 'baby_names/yob1880.txt', 'baby_names/yob1881.txt', 'baby_names/yob1895.txt', 'baby_names/yob1936.txt', 'baby_names/yob1922.txt', 'baby_names/yob2018.txt', 'baby_names/yob1934.txt', 'baby_names/yob1920.txt', 'baby_names/yob1908.txt', 'baby_names/yob1883.txt', 'baby_names/yob1897.txt', 'baby_names/yob1896.txt', 'baby_names/yob1882.txt', 'baby_names/yob1909.txt', 'baby_names/yob1921.txt', 'baby_names/yob1935.txt', 'baby_names/yob2019.txt']
# Read every yearly file with the same schema, then stack them into one frame.
frames = [pd.read_csv(f, names=["name", "sex", "count"]) for f in file_names]
df = pd.concat(frames)
df
| name | sex | count | |
|---|---|---|---|
| 0 | Emily | F | 25957 |
| 1 | Hannah | F | 23084 |
| 2 | Madison | F | 19968 |
| 3 | Ashley | F | 17997 |
| 4 | Sarah | F | 17706 |
| ... | ... | ... | ... |
| 32025 | Zyheem | M | 5 |
| 32026 | Zykel | M | 5 |
| 32027 | Zyking | M | 5 |
| 32028 | Zyn | M | 5 |
| 32029 | Zyran | M | 5 |
2020863 rows × 3 columns
# Row labels repeat across the concatenated years, hence the duplicate index values.
df[df["name"] == "Emily"]
| name | sex | count | |
|---|---|---|---|
| 0 | Emily | F | 25957 |
| 20595 | Emily | M | 30 |
| 6 | Emily | F | 12652 |
| 26398 | Emily | M | 11 |
| 163 | Emily | F | 1308 |
| ... | ... | ... | ... |
| 120 | Emily | F | 2184 |
| 9881 | Emily | M | 6 |
| 172 | Emily | F | 1094 |
| 11 | Emily | F | 8229 |
| 27045 | Emily | M | 8 |
218 rows × 3 columns
# Sample file name used to work out how to extract the year below.
f = file_names[0]
f
'baby_names/yob2000.txt'
# Everything before the first dot (demonstrates the extension-stripping idea).
"hello.there".partition('.')[0]
'hello'
# Strip the ".txt" extension, then take the last four characters -> the year.
int(f.split(".")[0][-4:])
2000
# One year per file, in the same (unsorted) order as file_names.
years = [int(f.split(".")[0][-4:]) for f in file_names]
years
[2000, 2014, 1938, 1910, 1904, 1905, 1911, 1939, 2015, 2001, 2017, 2003, 1907, 1913, 1898, 1899, 1912, 1906, 2002, 2016, 2012, 2006, 1902, 1916, 1889, 1888, 1917, 1903, 2007, 2013, 2005, 2011, 1915, 1901, 1929, 1928, 1900, 1914, 2010, 2004, 1973, 1967, 1998, 1999, 1966, 1972, 1958, 1964, 1970, 1971, 1965, 1959, 1961, 1975, 1949, 1948, 1974, 1960, 1976, 1962, 1989, 1988, 1963, 1977, 1952, 1946, 1991, 1985, 1984, 1990, 1947, 1953, 1979, 1945, 1951, 1986, 1992, 1993, 1987, 1950, 1944, 1978, 1940, 1954, 1968, 1983, 1997, 1996, 1982, 1969, 1955, 1941, 1957, 1943, 1994, 1980, 1981, 1995, 1942, 1956, 2009, 1919, 1931, 1925, 1886, 1892, 1893, 1887, 1924, 1930, 1918, 2020, 2008, 1926, 1932, 1891, 1885, 1884, 1890, 1933, 1927, 1923, 1937, 1894, 1880, 1881, 1895, 1936, 1922, 2018, 1934, 1920, 1908, 1883, 1897, 1896, 1882, 1909, 1921, 1935, 2019]
# Re-concatenate with the year as an outer index level (keys aligns with frames).
df = pd.concat(frames, keys=years)
df
| name | sex | count | ||
|---|---|---|---|---|
| 2000 | 0 | Emily | F | 25957 |
| 1 | Hannah | F | 23084 | |
| 2 | Madison | F | 19968 | |
| 3 | Ashley | F | 17997 | |
| 4 | Sarah | F | 17706 | |
| ... | ... | ... | ... | ... |
| 2019 | 32025 | Zyheem | M | 5 |
| 32026 | Zykel | M | 5 | |
| 32027 | Zyking | M | 5 | |
| 32028 | Zyn | M | 5 | |
| 32029 | Zyran | M | 5 |
2020863 rows × 3 columns
# All 1880 records, selected via the outer (year) index level.
df.loc[1880]
| name | sex | count | |
|---|---|---|---|
| 0 | Mary | F | 7065 |
| 1 | Anna | F | 2604 |
| 2 | Emma | F | 2003 |
| 3 | Elizabeth | F | 1939 |
| 4 | Minnie | F | 1746 |
| ... | ... | ... | ... |
| 1995 | Woodie | M | 5 |
| 1996 | Worthy | M | 5 |
| 1997 | Wright | M | 5 |
| 1998 | York | M | 5 |
| 1999 | Zachariah | M | 5 |
2000 rows × 3 columns
# All 2020 records — far more distinct names than in 1880.
df.loc[2020]
| name | sex | count | |
|---|---|---|---|
| 0 | Olivia | F | 17535 |
| 1 | Emma | F | 15581 |
| 2 | Ava | F | 13084 |
| 3 | Charlotte | F | 13003 |
| 4 | Sophia | F | 12976 |
| ... | ... | ... | ... |
| 31266 | Zykell | M | 5 |
| 31267 | Zylus | M | 5 |
| 31268 | Zymari | M | 5 |
| 31269 | Zyn | M | 5 |
| 31270 | Zyran | M | 5 |
31271 rows × 3 columns
# Move the outer index level (the year) into a regular column, auto-named level_0.
df = df.reset_index(level=0)
df
| level_0 | name | sex | count | |
|---|---|---|---|---|
| 0 | 2000 | Emily | F | 25957 |
| 1 | 2000 | Hannah | F | 23084 |
| 2 | 2000 | Madison | F | 19968 |
| 3 | 2000 | Ashley | F | 17997 |
| 4 | 2000 | Sarah | F | 17706 |
| ... | ... | ... | ... | ... |
| 32025 | 2019 | Zyheem | M | 5 |
| 32026 | 2019 | Zykel | M | 5 |
| 32027 | 2019 | Zyking | M | 5 |
| 32028 | 2019 | Zyn | M | 5 |
| 32029 | 2019 | Zyran | M | 5 |
2020863 rows × 4 columns
# Give the auto-generated level_0 column a meaningful name.
df = df.rename({"level_0": "year"}, axis=1)
df
| year | name | sex | count | |
|---|---|---|---|---|
| 0 | 2000 | Emily | F | 25957 |
| 1 | 2000 | Hannah | F | 23084 |
| 2 | 2000 | Madison | F | 19968 |
| 3 | 2000 | Ashley | F | 17997 |
| 4 | 2000 | Sarah | F | 17706 |
| ... | ... | ... | ... | ... |
| 32025 | 2019 | Zyheem | M | 5 |
| 32026 | 2019 | Zykel | M | 5 |
| 32027 | 2019 | Zyking | M | 5 |
| 32028 | 2019 | Zyn | M | 5 |
| 32029 | 2019 | Zyran | M | 5 |
2020863 rows × 4 columns
# Order the records chronologically by year.
df = df.sort_values(by=["year"])
df
| year | name | sex | count | |
|---|---|---|---|---|
| 1638 | 1880 | Dayton | M | 8 |
| 1341 | 1880 | Vern | M | 19 |
| 1340 | 1880 | Stewart | M | 19 |
| 1339 | 1880 | Randolph | M | 19 |
| 1338 | 1880 | Lucien | M | 19 |
| ... | ... | ... | ... | ... |
| 20853 | 2020 | Hisham | M | 31 |
| 20854 | 2020 | Jahleel | M | 31 |
| 20855 | 2020 | Jameir | M | 31 |
| 20857 | 2020 | Jenesis | M | 31 |
| 4287 | 2020 | Medina | F | 32 |
2020863 rows × 4 columns
# Persist the combined dataset; index=False drops the leftover per-file row labels.
df.to_csv("baby_names.csv", index=False)
pd.read_csv("baby_names.csv")
| year | name | sex | count | |
|---|---|---|---|---|
| 0 | 1880 | Dayton | M | 8 |
| 1 | 1880 | Vern | M | 19 |
| 2 | 1880 | Stewart | M | 19 |
| 3 | 1880 | Randolph | M | 19 |
| 4 | 1880 | Lucien | M | 19 |
| ... | ... | ... | ... | ... |
| 2020858 | 2020 | Hisham | M | 31 |
| 2020859 | 2020 | Jahleel | M | 31 |
| 2020860 | 2020 | Jameir | M | 31 |
| 2020861 | 2020 | Jenesis | M | 31 |
| 2020862 | 2020 | Medina | F | 32 |
2020863 rows × 4 columns
# IPython magic: sharper inline figures on high-DPI displays.
%config InlineBackend.figure_format = 'retina'
import seaborn as sns
import matplotlib.pyplot as plt
# Seaborn's bundled restaurant-tips demo dataset (fetched on first use).
tips = sns.load_dataset("tips")
tips.head()
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
# Plain matplotlib scatter, for comparison with the seaborn versions below.
plt.scatter(tips["total_bill"], tips["tip"]);
With seaborn:
# Figure-level seaborn scatter (relplot defaults to kind="scatter").
sns.relplot(data=tips,
x="total_bill",
y="tip"
);
Setting themes:
# Global theme: grid style, colorblind-safe palette, larger fonts.
sns.set_theme(style="darkgrid", # darkgrid, whitegrid, dark, white, ticks
palette="colorblind", # deep, muted, pastel, bright, dark, colorblind
font_scale=1.3
)
sns.relplot(data=tips,
x="total_bill",
y="tip"
);
# Temporarily override the axes style for just the plot inside the with-block.
with sns.axes_style("whitegrid"):
sns.relplot(data=tips,
x="total_bill",
y="tip"
);
Controlling size:
# height is inches per facet; aspect = width/height.
sns.relplot(data=tips,
x="total_bill",
y="tip",
height=6,
aspect=2
);
# hue= colors points by a categorical column.
sns.relplot(data=tips,
x="total_bill",
y="tip",
hue="day",
height=6,
aspect=2
);
# size= scales markers by a column; sizes= gives the min/max marker sizes.
sns.relplot(data=tips,
x="total_bill",
y="tip",
hue="day",
size="size",
sizes=(5,400),
height=6,
aspect=2
);
# col= facets into one subplot per value of the column.
sns.relplot(data=tips,
x="total_bill",
y="tip",
hue="day",
size="size",
sizes=(5,400),
col="time",
height=6,
aspect=1
);
# row= adds a second faceting dimension, giving a grid of subplots.
sns.relplot(data=tips,
x="total_bill",
y="tip",
hue="day",
size="size",
sizes=(5,400),
col="time",
row="sex",
height=6,
aspect=1,
);
# Joint scatter with marginal distributions on each axis.
g = sns.jointplot(data=tips, x="total_bill", y="tip", height=8)
g.set_axis_labels("Total bill", "Tip")
# y > 1 lifts the title above the marginal axes.
# NOTE(review): newer seaborn deprecates .fig in favor of .figure — confirm version.
g.fig.suptitle('Restaurant data', y = 1.01);
# Mix an axes-level seaborn plot with plain matplotlib in one figure.
plt.figure(figsize=(14,5))
plt.subplot(121)
sns.scatterplot(data=tips,
x="total_bill",
y="tip",
hue="day",
);
plt.xlabel('Total bill')
plt.ylabel('Tip')
plt.title('Restaurant data')
# Extra reference line drawn on the same axes as the scatter.
plt.plot([0,50], [0, 10], 'r-', lw=5)
plt.subplot(122)
plt.plot([1, 2, 3, 4], [3, 0, 2, 1], 'bo-');
import plotly.io as pio
# Render plotly figures inline in the notebook.
pio.renderers.default = "notebook"
import plotly.express as px
# Basic interactive scatter; width/height are in pixels.
fig = px.scatter(data_frame=tips,
x="total_bill",
y="tip",
title="Restaurant data",
width=800,
height=400
)
fig.show()
# color= is plotly's analogue of seaborn's hue=.
fig = px.scatter(data_frame=tips,
x="total_bill",
y="tip",
color="day",
title = "Restaurant data"
)
fig.show()
# size= scales marker area by a column.
fig = px.scatter(data_frame=tips,
x="total_bill",
y="tip",
color="day",
size="size",
title = "Restaurant data"
)
fig.show()
# facet_col= makes one subplot per category value.
fig = px.scatter(data_frame=tips,
x="total_bill",
y="tip",
color="day",
size="size",
facet_col="time",
title = "Restaurant data"
)
fig.show()
# facet_row= adds a second faceting dimension.
fig = px.scatter(data_frame=tips,
x="total_bill",
y="tip",
color="day",
size="size",
facet_col="time",
facet_row="sex",
title = "Restaurant data"
)
fig.show()
# Save the last figure as a standalone interactive HTML file.
fig.write_html("my_plot.html")
# Gapminder demo dataset bundled with plotly express.
gapminder = px.data.gapminder()
gapminder.head()
| country | continent | year | lifeExp | pop | gdpPercap | iso_alpha | iso_num | |
|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.445314 | AFG | 4 |
| 1 | Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.853030 | AFG | 4 |
| 2 | Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.100710 | AFG | 4 |
| 3 | Afghanistan | Asia | 1967 | 34.020 | 11537966 | 836.197138 | AFG | 4 |
| 4 | Afghanistan | Asia | 1972 | 36.088 | 13079460 | 739.981106 | AFG | 4 |
# Keep only countries whose name starts with "A".
ac = gapminder[gapminder["country"].str[0] == "A"]
ac.head()
| country | continent | year | lifeExp | pop | gdpPercap | iso_alpha | iso_num | |
|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.445314 | AFG | 4 |
| 1 | Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.853030 | AFG | 4 |
| 2 | Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.100710 | AFG | 4 |
| 3 | Afghanistan | Asia | 1967 | 34.020 | 11537966 | 836.197138 | AFG | 4 |
| 4 | Afghanistan | Asia | 1972 | 36.088 | 13079460 | 739.981106 | AFG | 4 |
# GDP-per-capita trajectories over time, one line per country.
fig = px.line(ac,
x="year",
y="gdpPercap",
color="country",
hover_name="country",
labels={"year" : "Year", # change x-axis label
"gdpPercap" : "GDP per capita", # change y-axis label
"country" : "Country name"}, # change legend title
title="Line plot"
)
fig.show()
# Re-display the tips data used by the categorical plots below.
tips
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
# Bar plot straight from the raw rows: bar heights stack the per-row tip values.
fig = px.bar(tips,
x="day",
y="tip",
color="sex",
barmode="group", # relative group overlay
category_orders = {"day" : ["Thur", "Fri", "Sat", "Sun"]}, # order days on x-axis
title="Bar plot"
)
fig.show()
# DataFrame with total tip amounts for a given day and sex
t = tips.groupby(["day", "sex"])["tip"].sum().reset_index()
display(t.head())
| day | sex | tip | |
|---|---|---|---|
| 0 | Thur | Male | 89.41 |
| 1 | Thur | Female | 82.42 |
| 2 | Fri | Male | 26.93 |
| 3 | Fri | Female | 25.03 |
| 4 | Sat | Male | 181.95 |
# Same grouped bar chart, now from the pre-aggregated totals in t.
fig = px.bar(t,
x="day",
y="tip",
color="sex",
barmode="group", # relative group overlay
category_orders = {"day" : ["Thur", "Fri", "Sat", "Sun"]}, # order days on x-axis
title="Bar plot"
)
fig.show()
# Strip plot: one jittered point per record.
fig = px.strip(tips,
x="day",
y="tip",
color="sex",
category_orders = {"day" : ["Thur", "Fri", "Sat", "Sun"]}, # order days on x-axis
title = "Strip plot"
)
fig.show()
# Box plot of total bills per day, split by sex.
fig = px.box(tips,
x="day",
y="total_bill",
color="sex",
labels = {"total_bill" : "total bill"},
category_orders = {"day" : ["Thur", "Fri", "Sat", "Sun"]},
title="Box plot")
fig.show()
# Histogram; nbins is a suggestion — plotly picks "nice" bin edges near it.
fig = px.histogram(tips,
x="total_bill",
labels = {"total_bill" : "total bill"},
nbins = 6,
#histnorm = "percent", # probability, percent, probability density
title = "Histogram"
)
fig.update_layout({"bargap": 0.02}) # add space between bars
fig.show()
# Rotatable 3-D scatter of bill, tip, and party size.
fig = px.scatter_3d(data_frame=tips,
x = "total_bill",
y = "tip",
z = "size",
color = "day"
)
fig.show()
From the course website:
# 2011 US agricultural-exports dataset hosted in plotly's sample-data repo.
url = "https://raw.githubusercontent.com/plotly/datasets/master/2011_us_ag_exports.csv"
import pandas as pd
df = pd.read_csv(url)
df.head(5)
| code | state | category | total exports | beef | pork | poultry | dairy | fruits fresh | fruits proc | total fruits | veggies fresh | veggies proc | total veggies | corn | wheat | cotton | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AL | Alabama | state | 1390.63 | 34.4 | 10.6 | 481.0 | 4.06 | 8.0 | 17.1 | 25.11 | 5.5 | 8.9 | 14.33 | 34.9 | 70.0 | 317.61 |
| 1 | AK | Alaska | state | 13.31 | 0.2 | 0.1 | 0.0 | 0.19 | 0.0 | 0.0 | 0.00 | 0.6 | 1.0 | 1.56 | 0.0 | 0.0 | 0.00 |
| 2 | AZ | Arizona | state | 1463.17 | 71.3 | 17.9 | 0.0 | 105.48 | 19.3 | 41.0 | 60.27 | 147.5 | 239.4 | 386.91 | 7.3 | 48.7 | 423.95 |
| 3 | AR | Arkansas | state | 3586.02 | 53.2 | 29.4 | 562.9 | 3.53 | 2.2 | 4.7 | 6.88 | 4.4 | 7.1 | 11.45 | 69.5 | 114.5 | 665.44 |
| 4 | CA | California | state | 16472.88 | 228.7 | 11.1 | 225.4 | 929.95 | 2791.8 | 5944.6 | 8736.40 | 803.2 | 1303.5 | 2106.79 | 34.6 | 249.3 | 1064.95 |
# Boolean flag column: True for states with any cotton exports.
df["exports_cotton"] = df["cotton"] > 0
df.head()
| code | state | category | total exports | beef | pork | poultry | dairy | fruits fresh | fruits proc | total fruits | veggies fresh | veggies proc | total veggies | corn | wheat | cotton | exports_cotton | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AL | Alabama | state | 1390.63 | 34.4 | 10.6 | 481.0 | 4.06 | 8.0 | 17.1 | 25.11 | 5.5 | 8.9 | 14.33 | 34.9 | 70.0 | 317.61 | True |
| 1 | AK | Alaska | state | 13.31 | 0.2 | 0.1 | 0.0 | 0.19 | 0.0 | 0.0 | 0.00 | 0.6 | 1.0 | 1.56 | 0.0 | 0.0 | 0.00 | False |
| 2 | AZ | Arizona | state | 1463.17 | 71.3 | 17.9 | 0.0 | 105.48 | 19.3 | 41.0 | 60.27 | 147.5 | 239.4 | 386.91 | 7.3 | 48.7 | 423.95 | True |
| 3 | AR | Arkansas | state | 3586.02 | 53.2 | 29.4 | 562.9 | 3.53 | 2.2 | 4.7 | 6.88 | 4.4 | 7.1 | 11.45 | 69.5 | 114.5 | 665.44 | True |
| 4 | CA | California | state | 16472.88 | 228.7 | 11.1 | 225.4 | 929.95 | 2791.8 | 5944.6 | 8736.40 | 803.2 | 1303.5 | 2106.79 | 34.6 | 249.3 | 1064.95 | True |
import plotly.express as px
# Two-color US map: states that do / do not export cotton.
fig = px.choropleth(df,
locationmode="USA-states",
locations="code",
color="exports_cotton",
scope="usa",
color_discrete_sequence = ["red", "lightgray"],
title = "States exporting cotton",
hover_name = "state",
hover_data={"exports_cotton": False, "code":False}
)
# The two colors are self-explanatory, so hide the legend.
fig.update_layout(showlegend=False)
fig.show()
# Vegetables as a percentage of each state's total exports.
df["veggies_perc"] = (df["total veggies"]/df["total exports"])*100
df.head(5)
| code | state | category | total exports | beef | pork | poultry | dairy | fruits fresh | fruits proc | total fruits | veggies fresh | veggies proc | total veggies | corn | wheat | cotton | exports_cotton | veggies_perc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AL | Alabama | state | 1390.63 | 34.4 | 10.6 | 481.0 | 4.06 | 8.0 | 17.1 | 25.11 | 5.5 | 8.9 | 14.33 | 34.9 | 70.0 | 317.61 | True | 1.030468 |
| 1 | AK | Alaska | state | 13.31 | 0.2 | 0.1 | 0.0 | 0.19 | 0.0 | 0.0 | 0.00 | 0.6 | 1.0 | 1.56 | 0.0 | 0.0 | 0.00 | False | 11.720511 |
| 2 | AZ | Arizona | state | 1463.17 | 71.3 | 17.9 | 0.0 | 105.48 | 19.3 | 41.0 | 60.27 | 147.5 | 239.4 | 386.91 | 7.3 | 48.7 | 423.95 | True | 26.443270 |
| 3 | AR | Arkansas | state | 3586.02 | 53.2 | 29.4 | 562.9 | 3.53 | 2.2 | 4.7 | 6.88 | 4.4 | 7.1 | 11.45 | 69.5 | 114.5 | 665.44 | True | 0.319295 |
| 4 | CA | California | state | 16472.88 | 228.7 | 11.1 | 225.4 | 929.95 | 2791.8 | 5944.6 | 8736.40 | 803.2 | 1303.5 | 2106.79 | 34.6 | 249.3 | 1064.95 | True | 12.789445 |
# Continuous-color US map of the vegetable share of exports.
fig = px.choropleth(df,
locationmode="USA-states",
scope="usa",
locations="code",
color="veggies_perc",
hover_name="state",
hover_data={"veggies_perc": ':.2f', "code": False},
labels = {"veggies_perc": "veggies %"},
color_continuous_scale= px.colors.sequential.tempo, #px.colors.sequential.Reds_r,
title="Percentage of vegetables in agricultural exports",
)
# White state borders plus a fully custom hover label (%{z} is the color value).
fig.update_traces(marker_line_color='white',
hovertemplate = '<b>%{hovertext}</b><br>Veggies exports: %{z:.2f}%<extra></extra>'
)
fig.show()
# Inspect the hover template that update_traces installed above.
fig.data[0].hovertemplate
'<b>%{hovertext}</b><br>Veggies exports: %{z:.2f}%<extra></extra>'
# List every continuous color scale shipped with plotly express.
dir(px.colors.sequential)
['Aggrnyl', 'Aggrnyl_r', 'Agsunset', 'Agsunset_r', 'Blackbody', 'Blackbody_r', 'Bluered', 'Bluered_r', 'Blues', 'Blues_r', 'Blugrn', 'Blugrn_r', 'Bluyl', 'Bluyl_r', 'Brwnyl', 'Brwnyl_r', 'BuGn', 'BuGn_r', 'BuPu', 'BuPu_r', 'Burg', 'Burg_r', 'Burgyl', 'Burgyl_r', 'Cividis', 'Cividis_r', 'Darkmint', 'Darkmint_r', 'Electric', 'Electric_r', 'Emrld', 'Emrld_r', 'GnBu', 'GnBu_r', 'Greens', 'Greens_r', 'Greys', 'Greys_r', 'Hot', 'Hot_r', 'Inferno', 'Inferno_r', 'Jet', 'Jet_r', 'Magenta', 'Magenta_r', 'Magma', 'Magma_r', 'Mint', 'Mint_r', 'OrRd', 'OrRd_r', 'Oranges', 'Oranges_r', 'Oryel', 'Oryel_r', 'Peach', 'Peach_r', 'Pinkyl', 'Pinkyl_r', 'Plasma', 'Plasma_r', 'Plotly3', 'Plotly3_r', 'PuBu', 'PuBuGn', 'PuBuGn_r', 'PuBu_r', 'PuRd', 'PuRd_r', 'Purp', 'Purp_r', 'Purples', 'Purples_r', 'Purpor', 'Purpor_r', 'Rainbow', 'Rainbow_r', 'RdBu', 'RdBu_r', 'RdPu', 'RdPu_r', 'Redor', 'Redor_r', 'Reds', 'Reds_r', 'Sunset', 'Sunset_r', 'Sunsetdark', 'Sunsetdark_r', 'Teal', 'Teal_r', 'Tealgrn', 'Tealgrn_r', 'Turbo', 'Turbo_r', 'Viridis', 'Viridis_r', 'YlGn', 'YlGnBu', 'YlGnBu_r', 'YlGn_r', 'YlOrBr', 'YlOrBr_r', 'YlOrRd', 'YlOrRd_r', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '_cols', '_contents', '_k', '_swatches', '_swatches_continuous', 'algae', 'algae_r', 'amp', 'amp_r', 'deep', 'deep_r', 'dense', 'dense_r', 'gray', 'gray_r', 'haline', 'haline_r', 'ice', 'ice_r', 'matter', 'matter_r', 'solar', 'solar_r', 'speed', 'speed_r', 'swatches', 'swatches_continuous', 'tempo', 'tempo_r', 'thermal', 'thermal_r', 'turbid', 'turbid_r']